In [1]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
# given some content...
content = ["How to format my hard disk", " Hard disk format problems "]
X = vectorizer.fit_transform(content)
feature_names = vectorizer.get_feature_names()
print("Feature names: {}".format(feature_names))
print(X.toarray().transpose())
Description of the output above: after the transpose, each row corresponds to a word (in the order of feature_names) and each column to a sentence from content. Each entry is the count of that word in that sentence; in this small example every count happens to be 0 or 1. Sentence 1 (content[0]) contains every word except "problems".
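Assuming scikit-learn's default tokenization (lowercasing, tokens of at least two letters), the printed output should look roughly like this; treat it as a sketch of the expected shape, not a captured transcript:

Feature names: ['disk', 'format', 'hard', 'how', 'my', 'problems', 'to']
[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]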
In [9]:
posts = [
    "This is a toy post about machine learning. Actually, it contains not much interesting stuff.",
    "Imaging databases provide storage capabilities.",
    "Most imaging databases save images permanently.",
    "Imaging databases store data.",
    "Imaging databases store data. Imaging databases store data. Imaging databases store data.",
]
# Create a training set
vectorizer = CountVectorizer(min_df=1)
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: {}, #features: {}".format(num_samples, num_features))
print(vectorizer.get_feature_names())
# create a new post
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
# a naive similarity measure (which uses the full ndarray of the new post)
import scipy as sp
import scipy.linalg  # "import scipy" alone does not load the linalg submodule

def dist_raw(v1, v2):
    delta = v1 - v2
    # norm: Euclidean (L2) norm of the difference vector
    return sp.linalg.norm(delta.toarray())
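# Worked example (by hand): for v1 = [0, 1, 1] and v2 = [1, 1, 0],
# delta = [-1, 0, 1], so dist_raw = sqrt(1 + 0 + 1) ≈ 1.41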
# Find distances among all posts
import sys

def find_distances(vectorizer, new_post, posts, dist_func=dist_raw):
    X_train = vectorizer.fit_transform(posts)
    new_post_vec = vectorizer.transform([new_post])
    num_samples, num_features = X_train.shape
    print("----------------------------------------")
    print("#samples: {}, #features: {}".format(num_samples, num_features))
    print(vectorizer.get_feature_names())
    print("----------------------------------------")
    best_dist = sys.maxsize
    best_i = None
    for i in range(num_samples):
        post = posts[i]
        if post == new_post:
            continue
        post_vec = X_train.getrow(i)
        d = dist_func(post_vec, new_post_vec)
        print("- Post %i with dist=%.2f: %s" % (i, d, post))
        if d < best_dist:
            best_dist = d
            best_i = i
    print("Best post is %i with dist=%.2f" % (best_i, best_dist))
find_distances(vectorizer, new_post, posts)
# explore the vectors for posts 3 & 4, since they both contain the same words
print("\nVectors for what should be similar sentences:")
print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())
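Since post 4 simply repeats post 3 three times, its count vector is exactly three times post 3's. With the 25 alphabetically sorted features this vocabulary produces, the two printed rows should look roughly like:

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]

dist_raw penalizes that factor of three even though post 4 carries no new information, which motivates the normalization in the next cell.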
In [10]:
# Normalize the vectors, and try again
def dist_norm(v1, v2):
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())
find_distances(vectorizer, new_post, posts, dist_func=dist_norm)
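Once the vectors are scaled to unit length, duplicating a post no longer changes its direction, so posts 3 and 4 should now sit at the same distance from the query; and because the loop only updates on a strictly smaller distance, post 3 is still the one reported as best.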
In [11]:
# using stop words, i.e. removing "noise" / words that carry little information
# use the built-in English stop word list (a custom list of words can also be passed)
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print("Some of our stop words: {}".format(", ".join(sorted(vectorizer.get_stop_words())[0:20])))
# construct a new training set
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: {}, #features: {}".format(num_samples, num_features))
print(vectorizer.get_feature_names())
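With scikit-learn's built-in English stop word list, frequent words such as "about", "is", "it", "most", "not" and "this" are discarded, so the feature count for these posts should drop from 25 to about 18 (the exact number depends on the stop word list shipped with your scikit-learn version).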
In [12]:
# Using NLTK for stemming (reducing different word forms to a common stem)
import nltk.stem as ns
s = ns.SnowballStemmer('english')
print(s.stem("graphics"))
print(s.stem("imaging"))
print(s.stem("image"))
print(s.stem("imagination"))
print(s.stem("imagine"))
In [13]:
# stem our posts before vectorizing
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # run the standard analyzer first, then stem every token it yields
        analyzer = super().build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
X_train = vectorizer.fit_transform(posts)
print(vectorizer.get_feature_names())
find_distances(vectorizer, new_post, posts, dist_func=dist_norm)
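After stemming, "imaging" and "images" both become "imag", so post 2 ("Most imaging databases save images permanently.") now shares two occurrences of "imag" with the query and should move up in the similarity ranking.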